%matplotlib notebook
%load_ext autoreload
%autoreload
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import OutlierDetection as od
# df from the original combine harvester with machineId 903
# set the dataset path
# Hard-coded, machine-specific location of the telematics CSV export.
inputPath = "D:/project/agata/data/olddata/"
inputFile = inputPath+"58900903_telematics_agata_v3__final.csv"
# Read the full dataset into `df_full`; `df` (below) keeps only the
# "ertrag" (German: yield) and "durchsatz" (German: throughput) columns and
# drops every row in which either of them is missing.
df_full = pd.read_csv(inputFile)
df_full.head()
#df_full.columns # List all the columns
df = df_full[["ertrag","durchsatz"]].dropna()
# show the head of the input dataset
df.head()
# list all column names available in the full dataset
df_full.columns
n_partition = 20 # number of partitions in each attribute
outlier_percent = 0.5 # percent of outliers -- NOTE(review): confirm whether 0.5 means 0.5% or 50%
myGBOD = od.GBOD(df, n_partition=n_partition,outlier_percent=outlier_percent) # create the GBOD object
# Call `run_GBOD()` to get the result; it is unpacked into two dataframes.
result, grid_info = myGBOD.run_GBOD()
# `result` is the dataframe with all the information of the points.
result.head()
# `grid_info` is a dataframe with all grid information
grid_info.head()
# You can also pass a different outlier percentage and set writeCSV=True to
# write the result out (presumably as a CSV file -- confirm in OutlierDetection).
# Note that this rebinds `result` and `grid_info`.
result, grid_info = myGBOD.run_GBOD(outlier_percent=0.25, writeCSV=True)
# Center bias is the difference between the point center and the grid center.
# Run the center-bias variant on the same GBOD object with outlier_grid_frac=0.5.
# The return value is unpacked into two objects, presumably a per-point result
# and a per-grid table analogous to `run_GBOD()` -- confirm in OutlierDetection.
result_cb, grid_info_cb = myGBOD.run_GBOD_center_bias(outlier_grid_frac=0.5)
result_cb.head()
grid_info_cb.head()
# Plot the histogram of the number of points in each grid (50 bins)
myGBOD.plotGridHist(bins = 50)
# maximum of `np_in_grid` (presumably the point count of the fullest grid -- confirm)
myGBOD.np_in_grid.max()
# Plot the heatmap
myGBOD.plotGridHeatMap(fontsize=12)
# plot the outliers with the default arguments, then again with grid=False
myGBOD.plotOutliers()
myGBOD.plotOutliers(grid=False)
# plot the points in the original dataset
myGBOD.plotOutliers_origin(outlier_percent=0.5, grid=False)
# plot the normed dataset with point center for each grid
myGBOD.plot_df_norm()
# plot the outliers after the center bias method (default arguments)
myGBOD.plotOutliers_CenterBias()
# Sample a given number of records
# Sample 4000 records from df (n=4000) with a fixed random_state
mySample = od.sample_df(df, n=4000, random_state=1)
mySample.head()
# Sample 1% of the records from df (frac=0.01); this rebinds `mySample`
mySample = od.sample_df(df, frac=0.01, random_state=1)
mySample.head()
# sample 7000 records from df
df7000=od.sample_df(df, n=7000, random_state=1)
# initialize the GBOD object with 20 partitions on each attribute.
myGBOD2 = od.GBOD(df7000, n_partition=20, outlier_percent=1)
# get result (rebinds `result` and `grid_info` from the earlier run)
result,grid_info = myGBOD2.run_GBOD()
result.head()
# select the normalized columns "ertrag_norm" and "durchsatz_norm" by name
myGBOD2.res.loc[:, ["ertrag_norm", "durchsatz_norm"]].head()
# largest entries of `np_in_grid`
myGBOD2.np_in_grid.nlargest()
# plot the result with the default arguments
myGBOD2.plotOutliers()
# plot the result again with grid=False
myGBOD2.plotOutliers(grid=False)
# plot the grid heatmap with a larger font size
myGBOD2.plotGridHeatMap(fontsize=14)
# Draw a 10000-record sample from df (rebinds `mySample`), build a KDKNN
# object on it with k=5, run it with outlier_percent=1 and plot the outliers.
mySample=od.sample_df(df, n=10000, random_state=1)
myKDKNN = od.KDKNN(mySample, k=5)
myKDKNN.run_KDKNN(outlier_percent=1).head()
myKDKNN.plot_outliers(grid=True, outlier_percent=1)
# Build a third GBOD object on `mySample` (20 partitions per attribute,
# outlier_percent=5) to compare against KDKNN.
myGBOD3 = od.GBOD(mySample, n_partition=20, outlier_percent=5)
# compare_KDKNN(outlier_percent)
# First call uses the method's default arguments; the second passes
# outlier_percent=2 explicitly.
myGBOD3.compare_KDKNN()
myGBOD3.compare_KDKNN(outlier_percent=2)
# Benchmark with the IPython `%timeit` magic: `findPartition()` on the first
# GBOD object, and `od.get_KDKNN_score` on the full two-column `df`.
%timeit myGBOD.findPartition()
%timeit od.get_KDKNN_score(df)